from gensim.models import Word2Vec, word2vec, KeyedVectors
import os

# Train a Word2Vec model on the segmented CSDN corpus and return the model.
def csdn_word2vector(cut_datas_path, save_model_path=None, size=100, min_count=5, window=5):
    '''
    Train a Word2Vec model on a segmented corpus and optionally save it to disk.

    :param cut_datas_path: corpus location, handed to ``word2vec.PathLineSentences``
    :param save_model_path: base directory for the saved artifacts. When falsy
        (the default) nothing is written. Otherwise the full model is saved to
        ``<save_model_path>/save_model/model`` and the vectors, in word2vec
        format, to ``<save_model_path>/save_vector/vector``; both
        sub-directories are created if they do not exist yet.
    :param size:    vector length
    :param min_count:   minimum word count, forwarded to ``Word2Vec`` as ``min_count``
    :param window:  sliding window width, forwarded to ``Word2Vec`` as ``window``
    :return: the trained ``Word2Vec`` model
    '''
    import inspect

    sentences = word2vec.PathLineSentences(cut_datas_path, limit=None)

    # gensim 4 renamed the ``size`` keyword to ``vector_size``; use whichever
    # name the installed Word2Vec constructor actually declares.
    init_params = inspect.signature(Word2Vec.__init__).parameters
    size_kwarg = 'vector_size' if 'vector_size' in init_params else 'size'
    model = Word2Vec(sentences, min_count=min_count, window=window, **{size_kwarg: size})

    # model save
    if save_model_path:
        model_dir = os.path.join(save_model_path, 'save_model')
        vector_dir = os.path.join(save_model_path, 'save_vector')
        # The save calls below do not create missing directories, so a fresh
        # output location would otherwise fail with FileNotFoundError.
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(vector_dir, exist_ok=True)

        model.save(os.path.join(model_dir, 'model'))
        model.wv.save_word2vec_format(os.path.join(vector_dir, 'vector'))

    return model

# Return the saved model and the word-vector table.
def reload_csdn_model(save_model_path):
    '''
    Load the artifacts previously written under ``save_model_path``.

    :param save_model_path: base directory containing ``save_model/model``
        and ``save_vector/vector``
    :return: tuple ``(model, vocab_vector_scale)`` - the object returned by
        ``Word2Vec.load`` and the word-vector table returned by
        ``KeyedVectors.load_word2vec_format``
    '''
    model_file = os.path.join(save_model_path, 'save_model', 'model')
    vector_file = os.path.join(save_model_path, 'save_vector', 'vector')

    # full model
    loaded_model = Word2Vec.load(model_file)
    # word-vector table
    loaded_vectors = KeyedVectors.load_word2vec_format(vector_file)

    return loaded_model, loaded_vectors

if __name__ == '__main__':
    # Example run: train on the segmented CSDN corpus, save the artifacts,
    # then print the learned vector for one sample word.
    cut_datas_path = r'E:\NLP1\骚操作\自学分词\csdn语料库训练\csdn_cut_datas'
    save_model_path = r'E:\NLP1\骚操作\自学分词\csdn语料库训练\csdn_w2v_model'
    model = csdn_word2vector(cut_datas_path, save_model_path)
    # Look the word up through ``model.wv``: indexing the Word2Vec object
    # directly is deprecated in gensim 3 and removed in gensim 4.
    print(model.wv['文件'])